{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from retrying import retry"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 防止访问失败的重试操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "@retry(stop_max_attempt_number=100, wait_fixed=1000)  # up to 100 attempts, 1 s apart; the error is raised only once all attempts fail\n",
    "def _open_url(url):\n",
    "    \"\"\"Download ``url`` with a browser-like User-Agent, retrying on failure.\n",
    "\n",
    "    Any exception raised in the body (timeout, connection error, non-200\n",
    "    status) makes the ``retry`` decorator try again.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the page to fetch.\n",
    "\n",
    "    Returns:\n",
    "        The ``requests.Response`` object, whose status code is 200.\n",
    "\n",
    "    Raises:\n",
    "        requests.exceptions.HTTPError: The status code was not 200.\n",
    "        requests.exceptions.RequestException: The request itself failed\n",
    "            (for example it timed out after 10 seconds).\n",
    "    \"\"\"\n",
    "    response = requests.get(url, headers={\n",
    "    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'\n",
    "    }, timeout=10)  # a timeout raises, which triggers a retry\n",
    "    # Raise explicitly rather than `assert`: assertions are removed when Python\n",
    "    # runs with -O, which would silently accept error pages.\n",
    "    if response.status_code != 200:\n",
    "        raise requests.exceptions.HTTPError(\n",
    "            'unexpected status %s for %s' % (response.status_code, url),\n",
    "            response=response)\n",
    "    return response"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 得到页面上符合条件的链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_menu(url):\n",
    "    \"\"\"Return the links found on the page at ``url`` that contain ``url``.\n",
    "\n",
    "    Every ``<a>`` tag's ``href`` is prefixed with the site root\n",
    "    ``https://www.xxx.net``; only the resulting addresses that contain ``url``\n",
    "    as a substring are kept, in document order.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the index page to scan.\n",
    "\n",
    "    Returns:\n",
    "        A list of absolute URL strings (possibly empty).\n",
    "    \"\"\"\n",
    "    response = _open_url(url)\n",
    "    response.encoding = 'utf-8'\n",
    "    soup = BeautifulSoup(response.text, features=\"lxml\")\n",
    "\n",
    "    urls_real = []\n",
    "    for item in soup.find_all(\"a\"):\n",
    "        href = item.get(\"href\")\n",
    "        # Skip anchors whose .string is None, and anchors without an href:\n",
    "        # concatenating None to the site root would raise TypeError.\n",
    "        if item.string is None or href is None:\n",
    "            continue\n",
    "        link = 'https://www.xxx.net' + href\n",
    "        if url in link:\n",
    "            urls_real.append(link)\n",
    "    return urls_real"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 无关文字过滤器，顺便实现了一些排版功能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textfilter(text,title):\n",
    "    \"\"\"Remove site boilerplate from chapter text and tidy its whitespace.\n",
    "\n",
    "    The substitutions are applied strictly in the order listed below; the\n",
    "    order matters (runs of four non-breaking spaces become a newline before\n",
    "    the remaining single ones become plain spaces).\n",
    "\n",
    "    Args:\n",
    "        text: Raw chapter text.\n",
    "        title: Chapter title; every occurrence is deleted from the text.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned text.\n",
    "    \"\"\"\n",
    "    substitutions = (\n",
    "        # advertising / error-report boilerplate\n",
    "        (\"天才一秒记住本站地址：[笔趣阁]\", \"\"),\n",
    "        (\"https://www.xxx.net/最快更新！无广告！\", \"\"),\n",
    "        (\"http://www.xxx.net/最快更新！无广告！\", \"\"),\n",
    "        (\"章节错误,点此报送(免注册),\", \"\"),\n",
    "        (\"报送后维护人员会在两分钟内校正章节内容,请耐心等待。\", \"\"),\n",
    "        # four non-breaking spaces mark a paragraph start\n",
    "        (\"\\xa0\\xa0\\xa0\\xa0\", \"\\n\"),\n",
    "        # leftover blank-line / indentation runs\n",
    "        (\"\\n\\r\\n\\n\\r\\n\\n\\r\\n            \\n\", \"\"),\n",
    "        (\"\\n\\r\\n                \\n\\r\\n                \\n\\r\\n            \\n\", \"\"),\n",
    "        (\"           \\r\\n            \\n\\r\\n                \\r\\n            \\n\", \"\"),\n",
    "        # any remaining non-breaking space becomes an ordinary space\n",
    "        (\"\\xa0\", \" \"),\n",
    "        # the body repeats the chapter title; drop it\n",
    "        (title, \"\"),\n",
    "    )\n",
    "    for old, new in substitutions:\n",
    "        text = text.replace(old, new)\n",
    "    return text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 得到章节的标题和内容"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text(url):\n",
    "    \"\"\"Fetch a chapter page and return its title and cleaned body text.\n",
    "\n",
    "    The title is read from the first ``<h1>`` element and the body from the\n",
    "    element with ``id='content'``; the body is then passed through\n",
    "    ``textfilter``.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the chapter page.\n",
    "\n",
    "    Returns:\n",
    "        A two-item list ``[title, text]``.\n",
    "    \"\"\"\n",
    "    page = _open_url(url)\n",
    "    page.encoding = 'utf-8'\n",
    "    soup = BeautifulSoup(page.text, features=\"lxml\")\n",
    "    title = soup.find('h1').get_text()\n",
    "    body = soup.find(id='content').get_text()\n",
    "    # strip unrelated boilerplate (and the repeated title) from the body\n",
    "    return [title, textfilter(body, title)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 开始读取内容，并且存储"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `menu` is not defined at notebook level (it is only a local inside\n",
    "# load_to_host), so build it here to keep this cell runnable on a fresh kernel.\n",
    "menu = get_menu('https://www.xxx.net/84_84792/')  # same index page as load_to_host's default url\n",
    "get_text(menu[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_to_host(url = 'https://www.xxx.net/84_84792/',bookname = '史上最强炼气期',path = r'./',skip = 13):\n",
    "    \"\"\"Download every chapter linked from ``url`` and append it to a text file.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the book's index page.\n",
    "        bookname: Base name of the output file (``<bookname>.txt``).\n",
    "        path: Directory the output file is written to.\n",
    "        skip: Number of leading links to drop from the index before\n",
    "            downloading (defaults to 13, the value previously hard-coded).\n",
    "\n",
    "    Returns:\n",
    "        1 once all chapters have been written.\n",
    "    \"\"\"\n",
    "    menu = get_menu(url)[skip:]  # drop the leading links\n",
    "    # Honour `path`; with the default './' this is the same file as before.\n",
    "    target = os.path.join(path, '%s.txt' % bookname)\n",
    "    for chapter_url in menu:  # distinct name so the `url` argument is not shadowed\n",
    "        title, text = get_text(chapter_url)\n",
    "        # Append and close per chapter so each finished chapter is on disk\n",
    "        # even if a later download fails.\n",
    "        with open(target, 'a', encoding='utf-8') as f:\n",
    "            f.write(title+'\\n'+text+'\\n\\n')\n",
    "        print(\"%s  储存完成！\" % title)\n",
    "    return 1"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
